%matplotlib inline
#%pprint ON
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
# Load the pickled project dataset into a DataFrame with one row per key of
# the pickled dict (the dict is column-oriented, hence the transpose).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.  The mode is kept as "r" exactly as before; Python 3
# would require "rb" here.
with open("final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)
df = pd.DataFrame.from_dict(data_dict).transpose()

# The dataset encodes missing values as the string 'NaN'; turn them into real
# NaNs so pandas treats them as missing.
df = df.replace('NaN', np.nan)

# Keep only the rows that have at least two non-missing values.
df = df.dropna(thresh=2, axis=0)

# Remove the e-mail address column and two rows that are excluded from the
# analysis (presumably non-person entries -- confirm against the raw data).
df.drop('email_address', axis=1, inplace=True)
df.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK'], axis=0, inplace=True)

# Coerce object columns to numeric dtypes where possible.
# NOTE(review): convert_objects is the legacy pandas API (deprecated in newer
# pandas releases); it is kept so the script keeps running unchanged in its
# current environment.
df = df.convert_objects(convert_numeric=True)
#df = df.apply(lambda x: x.fillna(x.mean()),axis=0)
# Outlier review: for every plotted column, print its three largest values and
# draw one dot per row, labelling the x axis with the row index.
# 'email_address', 'poi' and 'director_fees' are skipped.
skipped_columns = ('email_address', 'poi', 'director_fees')
for column in list(df):
    if column in skipped_columns:
        continue
    # Top three values of this column (descending sort, missing values last).
    # NOTE(review): DataFrame.sort is the legacy pandas API; newer releases
    # spell this sort_values.
    print(df[[column]].sort([column], ascending=[0]).head(3))
    plt.figure(figsize=(26, 6))
    df[column].plot(style='.')
    # One tick per row, labelled with the row index and rotated so the long
    # labels do not overlap.
    tick_positions = range(len(df[column]))
    plt.xticks(tick_positions, df.index, rotation=90)
    plt.title(column)
    plt.show()
# all_features: every column of df except the 'poi' label.
# features_list: the same columns with 'poi' placed first.
all_features = df.columns.tolist()
all_features.remove('poi')
features_list = ['poi'] + all_features
#my_dataset = df.T.to_dict('dict')
### Extract features and labels from dataset for local testing
#data = featureFormat(my_dataset, features_list, sort_keys = True)
# Give every row a plotting colour based on its 'poi' flag:
# red where poi is True, blue where it is False.
for poi_flag, colour in ((True, 'r'), (False, 'b')):
    df.loc[df['poi'] == poi_flag, 'color'] = colour

# Scatter every feature against every feature (all ordered pairs, including a
# feature against itself), one figure per pair.  The helper 'color' column is
# never plotted as a feature.
plot_columns = [name for name in all_features if name != 'color']
for x_name in plot_columns:
    for y_name in plot_columns:
        plt.scatter(x=df[x_name], y=df[y_name], c=df['color'])
        plt.title(x_name + ' vs ' + y_name)
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.show()